print("***1.A. Clean the Structured Data***")
***1.A. Clean the Structured Data***
print('#1.A.I. Missing value analysis and imputation. After checking for missing values, Write a function to plot the missing values in each column.')
#1.A.I. Missing value analysis and imputation. After checking for missing values, Write a function to plot the missing values in each column.
import pandas as pd
blog=pd.read_csv(r'G:\AIML Course Materials\Projects\NLP_Additional Project\TheSocialDilemma.csv')
blog.shape
(20068, 14)
blog.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NaN | Twitter Web App | False | Neutral |
| 1 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NaN | Twitter Web App | False | Neutral |
| 2 | Varun Tyagi | Goa, India | Indian | Tech Solution Artist & Hospitality Ex... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch “The Social Dilemma” on Netflix!\n\nI... | NaN | Twitter for iPhone | False | Positive |
| 3 | Casey Conway | Sydney, New South Wales | Head of Diversity & Inclusion @RugbyAU | It's ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched #TheSocialDilemma last night. I’m sc... | ['TheSocialDilemma'] | Twitter for iPhone | False | Negative |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | ['TheSocialDilemma'] | Twitter for iPhone | False | Positive |
blog.tail()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20063 | scp. | NaN | “Through love, all is possible.” - SJM - See m... | 2013-02-19 00:55:12 | 431 | 193 | 32958 | False | 2020-10-09 00:25:53 | #TheSocialDilemma yalll.... this shit... we kn... | ['TheSocialDilemma'] | Twitter for iPhone | False | Negative |
| 20064 | Dono6971 | United States | Father, Husband, and a Dude|| Love Notre Dame ... | 2010-01-06 04:08:41 | 172 | 96 | 50159 | False | 2020-10-09 00:24:45 | Peeps:\n\nFind 90 minutes this weekend and wat... | NaN | Twitter for iPhone | False | Positive |
| 20065 | Remi Shores | NaN | Genderfluid / They/Them/Theirs / Queer Christi... | 2012-05-16 23:49:13 | 387 | 652 | 7885 | False | 2020-10-09 00:11:42 | So you watched #thesocialdilemma, or have been... | ['thesocialdilemma'] | Twitter Web App | False | Negative |
| 20066 | Scott the Great and Terrible | NaN | I can't recall the taste of food, nor the soun... | 2020-03-16 18:20:31 | 103 | 84 | 2976 | False | 2020-10-09 00:10:16 | Good social media advice:\n\nChoose the thing ... | ['TheSocialDilemma'] | Twitter Web App | False | Positive |
| 20067 | Get Outside Media | Telluride, CO | CREATIVE AGENCY | BRAND + CONTENT + DESIGN + P... | 2018-07-14 04:44:23 | 133 | 898 | 1131 | False | 2020-10-09 00:00:31 | Boulder director Jeff Orlowski hopes viewers o... | ['TheSocialDilemma'] | Hootsuite Inc. | False | Neutral |
blog.isnull().sum()
user_name 1 user_location 4208 user_description 1383 user_created 0 user_followers 0 user_friends 0 user_favourites 0 user_verified 0 date 0 text 0 hashtags 4297 source 0 is_retweet 0 Sentiment 0 dtype: int64
# Missing values found in columns user_name,user_location, user_description & hashtags. These columns needs to be imputed to get rid of missing values
dt=blog.copy()
dt.shape
(20068, 14)
dt.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NaN | Twitter Web App | False | Neutral |
| 1 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NaN | Twitter Web App | False | Neutral |
| 2 | Varun Tyagi | Goa, India | Indian | Tech Solution Artist & Hospitality Ex... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch “The Social Dilemma” on Netflix!\n\nI... | NaN | Twitter for iPhone | False | Positive |
| 3 | Casey Conway | Sydney, New South Wales | Head of Diversity & Inclusion @RugbyAU | It's ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched #TheSocialDilemma last night. I’m sc... | ['TheSocialDilemma'] | Twitter for iPhone | False | Negative |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | ['TheSocialDilemma'] | Twitter for iPhone | False | Positive |
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def plot_missing_values(df):
# Calculating % of missing values in each column
percent_missing = dt.isnull().sum() / len(dt) * 100
# Create a DataFrame with column names and missing percentages
missing_data = pd.DataFrame({'Column': dt.columns, 'Missing %': percent_missing})
# Sort the DataFrame by missing percentage in descending order
missing_data = missing_data.sort_values('Missing %', ascending=False)
# Plot the missing values using a bar plot
plt.figure(figsize=(12, 6))
sns.barplot(x='Column', y='Missing %', data=missing_data)
plt.xticks(rotation=90)
plt.xlabel('Columns')
plt.ylabel('Missing Percentage')
plt.title('Missing Values in Each Column')
plt.show()
plot_missing_values(dt)
#Imputing missing values
# Impute missing values in 'hashtags' column with 'NoHashtag'
dt['hashtags'].fillna('NoHashtag', inplace=True)
# Impute missing values in 'user_location' column with a default value
dt['user_location'].fillna('Unknown', inplace=True)
# Impute missing values in 'user_description' column with an empty string
dt['user_description'].fillna('', inplace=True)
# Impute missing values in 'user_name' column with a default value
dt['user_name'].fillna('Unknown', inplace=True)
dt.isnull().sum()
user_name 0 user_location 0 user_description 0 user_created 0 user_followers 0 user_friends 0 user_favourites 0 user_verified 0 date 0 text 0 hashtags 0 source 0 is_retweet 0 Sentiment 0 dtype: int64
print('1.A.II. Eliminate Non-English textual data.')
1.A.II. Eliminate Non-English textual data.
!pip install langdetect
Collecting langdetect Downloading langdetect-1.0.9.tar.gz (981 kB) Requirement already satisfied: six in f:\anaconda3\lib\site-packages (from langdetect) (1.16.0) Building wheels for collected packages: langdetect Building wheel for langdetect (setup.py): started Building wheel for langdetect (setup.py): finished with status 'done' Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=8becf78419c9c23c7dc8c660ea740b89760dc47e62d846f5b56eea89e87fde94 Stored in directory: c:\users\richard\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb Successfully built langdetect Installing collected packages: langdetect Successfully installed langdetect-1.0.9
dt.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NoHashtag | Twitter Web App | False | Neutral |
| 1 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NoHashtag | Twitter Web App | False | Neutral |
| 2 | Varun Tyagi | Goa, India | Indian | Tech Solution Artist & Hospitality Ex... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch “The Social Dilemma” on Netflix!\n\nI... | NoHashtag | Twitter for iPhone | False | Positive |
| 3 | Casey Conway | Sydney, New South Wales | Head of Diversity & Inclusion @RugbyAU | It's ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched #TheSocialDilemma last night. I’m sc... | ['TheSocialDilemma'] | Twitter for iPhone | False | Negative |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | ['TheSocialDilemma'] | Twitter for iPhone | False | Positive |
dt.columns
Index(['user_name', 'user_location', 'user_description', 'user_created',
'user_followers', 'user_friends', 'user_favourites', 'user_verified',
'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment'],
dtype='object')
from langdetect import detect
text_columns = ['user_name', 'user_location', 'user_description', 'text']
dt['language'] = dt[text_columns].apply(lambda row: detect(' '.join(row.values.astype(str))) if row.notnull().all() else '', axis=1)
dt = dt[dt['language'] == 'en']
dt = dt.drop('language', axis=1)
dt = dt.reset_index(drop=True) #Reset the row numbering
dt.shape
(19901, 14)
# After eliminating non-english textual data, number of rows has reduced from 20068 to 19901
dt.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NoHashtag | Twitter Web App | False | Neutral |
| 1 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NoHashtag | Twitter Web App | False | Neutral |
| 2 | Varun Tyagi | Goa, India | Indian | Tech Solution Artist & Hospitality Ex... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch “The Social Dilemma” on Netflix!\n\nI... | NoHashtag | Twitter for iPhone | False | Positive |
| 3 | Casey Conway | Sydney, New South Wales | Head of Diversity & Inclusion @RugbyAU | It's ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched #TheSocialDilemma last night. I’m sc... | ['TheSocialDilemma'] | Twitter for iPhone | False | Negative |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | ['TheSocialDilemma'] | Twitter for iPhone | False | Positive |
blog['text'].head(10)
0 @musicmadmarc @SocialDilemma_ @netflix @Facebo... 1 @musicmadmarc @SocialDilemma_ @netflix @Facebo... 2 Go watch “The Social Dilemma” on Netflix!\n\nI... 3 I watched #TheSocialDilemma last night. I’m sc... 4 The problem of me being on my phone most the t... 5 #TheSocialDilemma 😳 wow!! We need regulations ... 6 @harari_yuval what do you think about #TheSoci... 7 Erm #TheSocialDilemma makes me want to go off ... 8 #TheSocialDilemma is not a documentary, it's h... 9 Okay i’m watching #TheSocialDilemma now. Name: text, dtype: object
dt['text'].head(10)
0 @musicmadmarc @SocialDilemma_ @netflix @Facebo... 1 @musicmadmarc @SocialDilemma_ @netflix @Facebo... 2 Go watch “The Social Dilemma” on Netflix!\n\nI... 3 I watched #TheSocialDilemma last night. I’m sc... 4 The problem of me being on my phone most the t... 5 #TheSocialDilemma 😳 wow!! We need regulations ... 6 Erm #TheSocialDilemma makes me want to go off ... 7 #TheSocialDilemma is not a documentary, it's h... 8 Okay i’m watching #TheSocialDilemma now. 9 Okey okey, I’ve been peer pressured into watch... Name: text, dtype: object
blog['user_description'].head(10)
0 Premier Facebook Marketing Expert | Social Med... 1 Premier Facebook Marketing Expert | Social Med... 2 Indian | Tech Solution Artist & Hospitality Ex... 3 Head of Diversity & Inclusion @RugbyAU | It's ... 4 Instagram Charlottejyates 5 NaN 6 Küçük küçük şeyler söyler, küçük küçük videola... 7 Mother, optimist, feminist, pacifist, retired ... 8 African🌍 | Music🎶 | Lakers🏀|Manchester United ... 9 IG:@RYANWHITEC 💻Digital Content Creator. 97.9 ... Name: user_description, dtype: object
dt['user_description'].head(10)
0 Premier Facebook Marketing Expert | Social Med... 1 Premier Facebook Marketing Expert | Social Med... 2 Indian | Tech Solution Artist & Hospitality Ex... 3 Head of Diversity & Inclusion @RugbyAU | It's ... 4 Instagram Charlottejyates 5 6 Mother, optimist, feminist, pacifist, retired ... 7 African🌍 | Music🎶 | Lakers🏀|Manchester United ... 8 IG:@RYANWHITEC 💻Digital Content Creator. 97.9 ... 9 Science kid. Herbivore. Opinionated. Tweets ab... Name: user_description, dtype: object
#In the above example, 6th row was elimnated after considering removal of non-english text data
print('***1.B. Write a custom function to plot the count of unique functions in every column***')
***1.B. Write a custom function to plot the count of unique functions in every column***
import matplotlib.pyplot as plt
def plot_unique_value_counts(df):
unique_counts = df.nunique()
column_names = unique_counts.index
count_values = unique_counts.values
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(column_names, count_values)
ax.set_xticks(range(len(column_names))) # Set tick locations
ax.set_xticklabels(column_names, rotation=90)
ax.set_xlabel('Columns')
ax.set_ylabel('Count of Unique Values')
ax.set_title('Count of Unique Values in Each Column')
plt.tight_layout()
plt.show()
plot_unique_value_counts(dt)
print('***1.C. plot for Social Dilemma Sentiment Labels***')
***1.C. plot for Social Dilemma Sentiment Labels***
import matplotlib.pyplot as plt
def plot_unique_counts(df, column_name):
unique_counts = dt[column_name].value_counts()
plt.figure(figsize=(8, 6))
plt.bar(unique_counts.index, unique_counts.values)
plt.xlabel(column_name)
plt.ylabel('Count')
plt.title(f'Count of Unique Values in {column_name}')
plt.show()
plot_unique_counts(dt,'Sentiment')
# The dataset has more positive sentiments compared to negative and neutral sentiments
print('***1.E. Plot and identify the top 20 users, user sources, user locations by number of tweets***')
***1.E. Plot and identify the top 20 users, user sources, user locations by number of tweets***
import plotly.express as px
def plot_top_n(df, column_name, n=20):
top_n = dt[column_name].value_counts().head(n)
fig = px.bar(top_n, x=top_n.index, y=top_n.values, labels={column_name: column_name, 'index': 'Number of Tweets'})
fig.update_layout(title=f'Top {n} {column_name} by Number of Tweets', xaxis_tickangle=-45)
fig.show()
# Use the above function to identify the top 20 users, user sources, user locations
plot_top_n(dt, 'user_name', n=20)
# User 'OurPact' has highest number of tweets of the top 20
plot_top_n(dt, 'source', n=20)
# User source of 'Twitter for iPhone' has highest number of tweets of the top 20
plot_top_n(dt, 'user_location', n=20)
#The above data is interesting as highest tweet has come from unknown locations. This could be most of the people tweeting would not prefer to share their location.However, the next highest is from India
print("***1.E. Take the top 50 user locations based on no of tweets and try to make the format into city, country for these locations. Incase if only city is present we try to map it to the country from the previous data available***")
***1.E. Take the top 50 user locations based on no of tweets and try to make the format into city, country for these locations. Incase if only city is present we try to map it to the country from the previous data available***
#We can use the geopy library to geocode the locations and retrieve the corresponding city and country information.
!pip install geopy
Collecting geopy Downloading geopy-2.3.0-py3-none-any.whl (119 kB) Collecting geographiclib<3,>=1.52 Downloading geographiclib-2.0-py3-none-any.whl (40 kB) Installing collected packages: geographiclib, geopy Successfully installed geographiclib-2.0 geopy-2.3.0
from geopy.geocoders import Nominatim
def format_location(location):
geolocator = Nominatim(user_agent="my_app")
# If location already in "city, country" format
if "," in location:
return location
# Geocode location to get city and country information
location_info = geolocator.geocode(location, timeout=10)
if location_info is not None:
city = location_info.raw.get('address', {}).get('city')
country = location_info.raw.get('address', {}).get('country')
# If city and country are available
if city and country:
return f"{city}, {country}"
# If city and country are not available, return the original location
return location
# Assuming you have a DataFrame called 'df' with a 'user_location' column
top_50_locations = dt['user_location'].value_counts().head(50).index.tolist()
formatted_locations = [format_location(location) for location in top_50_locations]
# Create a new DataFrame with formatted locations
formatted_df = pd.DataFrame({'user_location': formatted_locations})
# Print the formatted DataFrame
print(formatted_df)
user_location 0 Unknown 1 India 2 Mumbai, India 3 Los Angeles, CA 4 London, England 5 San Diego, CA 6 New Delhi, India 7 London 8 United States 9 Canada 10 New York, NY 11 Mumbai 12 Bengaluru, India 13 United Kingdom 14 Hyderabad, India 15 Chicago, IL 16 Pune, India 17 Atlanta, GA 18 California, USA 19 San Francisco, CA 20 Los Angeles 21 Toronto, Ontario 22 Seattle, WA 23 Washington, DC 24 New York, USA 25 Melbourne, Victoria 26 South Africa 27 Dallas, TX 28 New Delhi 29 Boston, MA 30 Philippines 31 Johannesburg, South Africa 32 Portland, OR 33 Earth 34 Chennai, India 35 Austin, TX 36 Texas, USA 37 USA 38 Delhi 39 England, United Kingdom 40 Winchester, VA 41 New York 42 Bangalore 43 UK 44 Nairobi, Kenya 45 Houston, TX 46 Australia 47 London, UK 48 Cape Town, South Africa 49 Singapore
print('***1.F. Plot the count of tweets from every place identified above.***')
***1.F. Plot the count of tweets from every place identified above.***
# Count the number of tweets for each location
location_counts = formatted_df['user_location'].value_counts()
# Plot the count of tweets for each location
plt.figure(figsize=(12, 6))
location_counts.plot(kind='bar')
plt.xlabel('Location')
plt.ylabel('Number of Tweets')
plt.title('Count of Tweets from Each Place')
plt.xticks(rotation=90)
# Display the actual counts above each bar
for i, count in enumerate(location_counts.values):
plt.text(i, count, str(count), ha='center', va='bottom')
plt.show()
# The above data shoes top 50 tweets from each identified location. Some are country in therir respective state, some are only country and 1 unknown
print('***1.G. Get the number of Hashtags present in each tweet and plot the distribution of number of hashtags intweet.***')
***1.G. Get the number of Hashtags present in each tweet and plot the distribution of number of hashtags intweet.***
# Get the number of hashtags in each tweet
dt['Count_Hashtags'] = dt['hashtags'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
# Plot the distribution of the number of hashtags
plt.figure(figsize=(10, 6))
plt.hist(dt['Count_Hashtags'], bins=range(11))
plt.xlabel('Number of Hashtags')
plt.ylabel('Frequency')
plt.title('Distribution of Number of Hashtags in Tweets')
plt.xticks(range(11))
plt.show()
# From the above dats we could find number of hashtags between 1 to 2 has the largest counts
print('***1.H. Plot the daily and hourly distribution of tweets***')
***1.H. Plot the daily and hourly distribution of tweets***
dt.columns
Index(['user_name', 'user_location', 'user_description', 'user_created',
'user_followers', 'user_friends', 'user_favourites', 'user_verified',
'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment',
'count_Hashtags', 'Count_Hashtags'],
dtype='object')
# Convert 'date' column to datetime type
dt['date'] = pd.to_datetime(dt['date'])
# Extract the date and hour information
dt['day'] = dt['date'].dt.date
dt['hour'] = dt['date'].dt.hour
# Calculate the daily and hourly tweet counts
daily_counts = dt['day'].value_counts().sort_index()
hourly_counts = dt['hour'].value_counts().sort_index()
# Plot the daily distribution of tweets
plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.title('Daily Distribution of Tweets')
plt.xticks(daily_counts.index, daily_counts.index.astype(str), rotation=45)
plt.tight_layout()
plt.show()
# Plot the hourly distribution of tweets
plt.figure(figsize=(12, 6))
hourly_counts.plot(kind='bar')
plt.xlabel('Hour')
plt.ylabel('Number of Tweets')
plt.title('Hourly Distribution of Tweets')
plt.xticks(range(24),rotation=45)
plt.tight_layout()
plt.show()
#The following observation are made from the above graphs:
#12th Spetember 2020 has highest number of tweeks.
#More tweets are active in Spetember only. In OCtober 2020, the tweet counts are lower than ~250
#Tweets are generally higher betweeb 6 to 8 PM.
#Tweets are quite active between 1AM to 3 PM, 2 PM to 10 PM
print('***I. Identify the number of users created every year and plot the distribution***')
***I. Identify the number of users created every year and plot the distribution***
# Converting 'user_created' column to datetime type
dt['user_created'] = pd.to_datetime(dt['user_created'])
# Extract the year from 'user_created' column
dt['user_created_year'] = dt['user_created'].dt.year
# Count the number of users for each year
users_created_per_year = dt['user_created_year'].value_counts().sort_index()
# Plot the distribution of users created per year
plt.figure(figsize=(12, 6))
plt.bar(users_created_per_year.index, users_created_per_year.values)
plt.xlabel('Year')
plt.ylabel('Number of Users')
plt.title('Distribution of Users Created per Year')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# From the above graph, it is identified that 2009 has highest number if users created
print('***1.J. Find the top 10 hashtags used in the tweet***')
***1.J. Find the top 10 hashtags used in the tweet***
# Extract the hashtags from 'hashtags' column
hashtags = dt['hashtags'].str.lower().str.split(',')
# Flatten the list of hashtags
hashtags_flat = [tag for sublist in hashtags for tag in sublist]
# Calculate the frequency of each hashtag
top_hashtags = pd.Series(hashtags_flat).value_counts().head(10)
# Print the top 10 hashtags
print(top_hashtags)
['thesocialdilemma'] 12571 nohashtag 4280 ['thesocialdilemma' 1678 'netflix'] 664 'thesocialdilemma'] 639 'thesocialdilemma' 199 ['netflix' 175 'netflix' 138 'socialmedia'] 121 'socialmedia' 58 dtype: int64
# From the above analysis, it is noted that 'thesocialdiemna' hashtag has the highest counts. 4280 tweets does not have hashtag
print('***1.K. Get the number of words in each text and plot the distribution of number of words for each class.***')
***1.K. Get the number of words in each text and plot the distribution of number of words for each class.***
# Calculate the number of words in each text
dt['word_count'] = dt['text'].str.split().apply(len)
# Group the texts by class
grouped = dt.groupby('Sentiment')
# Plot the distribution of number of words for each class
plt.figure(figsize=(10, 6))
for sentiment, group in grouped:
plt.hist(group['word_count'], bins=20, alpha=0.5, label=sentiment)
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Distribution of Number of Words by Sentiment')
plt.legend()
plt.show()
# From the above graph, the following observatinos are noted:
#Word counts between about 19 to 22 generally tend to have positive sentiment where most of the user tweets are found
#Word counts below 5 tend to be neutral with negligible contribution of negative sentiment
print('***1.L. Plot the word cloud for negative and positive tweets and write your inferences***')
***1.L. Plot the word cloud for negative and positive tweets and write your inferences***
!pip install plotly wordcloud
Requirement already satisfied: plotly in f:\anaconda3\lib\site-packages (5.6.0) Requirement already satisfied: wordcloud in f:\anaconda3\lib\site-packages (1.9.2) Requirement already satisfied: six in f:\anaconda3\lib\site-packages (from plotly) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in f:\anaconda3\lib\site-packages (from plotly) (8.0.1) Requirement already satisfied: matplotlib in f:\anaconda3\lib\site-packages (from wordcloud) (3.5.1) Requirement already satisfied: pillow in f:\anaconda3\lib\site-packages (from wordcloud) (9.0.1) Requirement already satisfied: numpy>=1.6.1 in f:\anaconda3\lib\site-packages (from wordcloud) (1.21.5) Requirement already satisfied: packaging>=20.0 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (21.3) Requirement already satisfied: fonttools>=4.22.0 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0) Requirement already satisfied: pyparsing>=2.2.1 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.4) Requirement already satisfied: python-dateutil>=2.7 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.3.2) Requirement already satisfied: cycler>=0.10 in f:\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
# Define a regex pattern to match special characters
pattern = r'[^a-zA-Z0-9\s]'
# Remove special characters from the 'Text' column
dt['text'] = dt['text'].str.replace(pattern, '')
dt['user_description'] = dt['user_description'].str.replace(pattern, '')
dt['hashtags'] = dt['hashtags'].str.replace(pattern, '')
# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt') # Download the tokenization models
# Filter the positive and negative tweets
positive_tweets = dt[dt['Sentiment'] == 'positive']
negative_tweets = dt[dt['Sentiment'] == 'negative']
# Combine all positive and negative tweets into a single string
positive_text = ' '.join(positive_tweets['text'])
negative_text = ' '.join(negative_tweets['text'])
# Tokenize the positive and negative text
positive_tokens = word_tokenize(positive_text)
negative_tokens = word_tokenize(negative_text)
# Remove stopwords from the tokenized text
stopwords = set(stopwords.words('english'))
unwanted_words = ['word1', 'word2', 'word3'] # Add your unwanted words here
positive_tokens = [token.lower() for token in positive_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]
negative_tokens = [token.lower() for token in negative_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]
# Calculate word frequency
positive_word_freq = Counter(positive_tokens)
negative_word_freq = Counter(negative_tokens)
# Get the most frequent words and their frequencies
positive_most_common = positive_word_freq.most_common(20)
negative_most_common = negative_word_freq.most_common(20)
# Extract the words and frequencies if available
positive_words, positive_freq = zip(*positive_most_common) if positive_most_common else ([], [])
negative_words, negative_freq = zip(*negative_most_common) if negative_most_common else ([], [])
# Check if there are any words present before plotting the bar plots
if positive_words and positive_words[0]:
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.barplot(x=positive_freq, y=positive_words)
plt.title('Most Frequent Words - Positive Tweets')
plt.xlabel('Frequency')
if negative_words and negative_words[0]:
if not positive_words or not positive_words[0]:
plt.figure(figsize=(6, 6))
plt.subplot(1, 2, 2)
sns.barplot(x=negative_freq, y=negative_words)
plt.title('Most Frequent Words - Negative Tweets')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()
C:\Users\Richard\AppData\Local\Temp\ipykernel_17056\3271237600.py:15: FutureWarning: The default value of regex will change from True to False in a future version. C:\Users\Richard\AppData\Local\Temp\ipykernel_17056\3271237600.py:16: FutureWarning: The default value of regex will change from True to False in a future version. C:\Users\Richard\AppData\Local\Temp\ipykernel_17056\3271237600.py:17: FutureWarning: The default value of regex will change from True to False in a future version. [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
<Figure size 432x288 with 0 Axes>
# Save DataFrame to CSV
dt.to_csv('G:\\AIML Course Materials\\Projects\\NLP_Additional Project\\blog_dt.csv', index=False)
data=pd.read_csv('G:\\AIML Course Materials\\Projects\\NLP_Additional Project\\blog_dt.csv')
data.shape
(19901, 20)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt') # Download the tokenization models
# Filter the positive and negative tweets
positive_tweets = data[data['Sentiment'] == 'positive']
negative_tweets = data[data['Sentiment'] == 'negative']
# Combine all positive and negative tweets into a single string
positive_text = ' '.join(positive_tweets['text'])
negative_text = ' '.join(negative_tweets['text'])
# Tokenize the positive and negative text
positive_tokens = word_tokenize(positive_text)
negative_tokens = word_tokenize(negative_text)
# Remove stopwords from the tokenized text
stopwords = set(stopwords.words('english'))
unwanted_words = ['word1', 'word2', 'word3'] # Add your unwanted words here
positive_tokens = [token.lower() for token in positive_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]
negative_tokens = [token.lower() for token in negative_tokens if token.lower() not in stopwords and token.lower() not in unwanted_words]
# Calculate word frequency
positive_word_freq = Counter(positive_tokens)
negative_word_freq = Counter(negative_tokens)
# Get the most frequent words and their frequencies
positive_most_common = positive_word_freq.most_common(20)
negative_most_common = negative_word_freq.most_common(20)
# Extract the words and frequencies if available
positive_words, positive_freq = zip(*positive_most_common) if positive_most_common else ([], [])
negative_words, negative_freq = zip(*negative_most_common) if negative_most_common else ([], [])
# Check if there are any words present before plotting the bar plots
if positive_words and positive_words[0]:
plt.figure(figsize=(50, 6))
plt.subplot(1, 2, 1)
sns.barplot(x=positive_freq, y=positive_words)
plt.title('Most Frequent Words - Positive Tweets')
plt.xlabel('Frequency')
if negative_words and negative_words[0]:
if not positive_words or not positive_words[0]:
plt.figure(figsize=(25, 6))
plt.subplot(1, 2, 2)
sns.barplot(x=negative_freq, y=negative_words)
plt.title('Most Frequent Words - Negative Tweets')
plt.xlabel('Frequency')
#plt.tight_layout()
plt.show()
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
#Pre-process the data using various techniques and libraries
print('***2.A. Eliminate All special Characters and Numbers***')
***2.A. Eliminate All special Characters and Numbers***
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19901 entries, 0 to 19900 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_name 19901 non-null object 1 user_location 19901 non-null object 2 user_description 18382 non-null object 3 user_created 19901 non-null object 4 user_followers 19901 non-null int64 5 user_friends 19901 non-null int64 6 user_favourites 19901 non-null int64 7 user_verified 19901 non-null bool 8 date 19901 non-null object 9 text 19901 non-null object 10 hashtags 19901 non-null object 11 source 19901 non-null object 12 is_retweet 19901 non-null bool 13 Sentiment 19901 non-null object 14 count_Hashtags 19901 non-null int64 15 Count_Hashtags 19901 non-null int64 16 day 19901 non-null object 17 hour 19901 non-null int64 18 user_created_year 19901 non-null int64 19 word_count 19901 non-null int64 dtypes: bool(2), int64(8), object(10) memory usage: 2.8+ MB
data.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | count_Hashtags | Count_Hashtags | day | hour | user_created_year | word_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert Social Medi... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | musicmadmarc SocialDilemma netflix Facebook Im... | NoHashtag | Twitter Web App | False | Neutral | 1 | 1 | 2020-09-16 | 20 | 2007 | 16 |
| 1 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert Social Medi... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | musicmadmarc SocialDilemma netflix Facebook ha... | NoHashtag | Twitter Web App | False | Neutral | 1 | 1 | 2020-09-16 | 20 | 2007 | 18 |
| 2 | Varun Tyagi | Goa, India | Indian Tech Solution Artist Hospitality Expe... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch The Social Dilemma on Netflix\n\nIts ... | NoHashtag | Twitter for iPhone | False | Positive | 1 | 1 | 2020-09-16 | 20 | 2009 | 20 |
| 3 | Casey Conway | Sydney, New South Wales | Head of Diversity Inclusion RugbyAU Its not ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched TheSocialDilemma last night Im scare... | TheSocialDilemma | Twitter for iPhone | False | Negative | 1 | 1 | 2020-09-16 | 20 | 2012 | 22 |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | TheSocialDilemma | Twitter for iPhone | False | Positive | 1 | 1 | 2020-09-16 | 20 | 2012 | 17 |
import re
# Define a regex pattern to match special characters and numbers
pattern = r'[^a-zA-Z\s]|\d+'
data['user_description'] = data['user_description'].astype(str)
# Apply the pattern to remove special characters and numbers from the 'text' column
data['text'] = data['text'].apply(lambda x: re.sub(pattern, '', x))
data['user_description'] = data['user_description'].apply(lambda x: re.sub(pattern, '', x))
data['hashtags'] = data['hashtags'].apply(lambda x: re.sub(pattern, '', x))
data['source'] = data['source'].apply(lambda x: re.sub(pattern, '', x))
data['user_name'] = data['user_name'].apply(lambda x: re.sub(pattern, '', x))
data['user_location'] = data['user_location'].apply(lambda x: re.sub(pattern, '', x))
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19901 entries, 0 to 19900 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_name 19901 non-null object 1 user_location 19901 non-null object 2 user_description 19901 non-null object 3 user_created 19901 non-null object 4 user_followers 19901 non-null int64 5 user_friends 19901 non-null int64 6 user_favourites 19901 non-null int64 7 user_verified 19901 non-null bool 8 date 19901 non-null object 9 text 19901 non-null object 10 hashtags 19901 non-null object 11 source 19901 non-null object 12 is_retweet 19901 non-null bool 13 Sentiment 19901 non-null object 14 count_Hashtags 19901 non-null int64 15 Count_Hashtags 19901 non-null int64 16 day 19901 non-null object 17 hour 19901 non-null int64 18 user_created_year 19901 non-null int64 19 word_count 19901 non-null int64 dtypes: bool(2), int64(8), object(10) memory usage: 2.8+ MB
print('***2.B. Remove html tags***')
***2.B. Remove html tags***
data.columns
Index(['user_name', 'user_location', 'user_description', 'user_created',
'user_followers', 'user_friends', 'user_favourites', 'user_verified',
'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment',
'count_Hashtags', 'Count_Hashtags', 'day', 'hour', 'user_created_year',
'word_count'],
dtype='object')
# Define a regular expression pattern to find HTML tags
pattern = r'<[^>]+>'
# Iterate over the object columns to find HTML tags
columns_to_check = ['user_name', 'user_location', 'user_description', 'text', 'hashtags', 'source']
for column in columns_to_check:
# Apply the regular expression pattern to each value in the column
tags = data[column].apply(lambda x: re.findall(pattern, str(x)))
# Print the unique HTML tags found in the column
unique_tags = set([tag for sublist in tags for tag in sublist])
print(f"HTML tags in column '{column}':")
for tag in unique_tags:
print(tag)
print()
HTML tags in column 'user_name': HTML tags in column 'user_location': HTML tags in column 'user_description': HTML tags in column 'text': HTML tags in column 'hashtags': HTML tags in column 'source':
#No html tags identifed. However, proceeding to clean to data
from bs4 import BeautifulSoup
# Remove HTML tags
data['user_location'] = data['user_location'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['user_description'] = data['user_description'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['text'] = data['text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['hashtags'] = data['hashtags'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data['source'] = data['source'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19901 entries, 0 to 19900 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_name 19901 non-null object 1 user_location 19901 non-null object 2 user_description 19901 non-null object 3 user_created 19901 non-null object 4 user_followers 19901 non-null int64 5 user_friends 19901 non-null int64 6 user_favourites 19901 non-null int64 7 user_verified 19901 non-null bool 8 date 19901 non-null object 9 text 19901 non-null object 10 hashtags 19901 non-null object 11 source 19901 non-null object 12 is_retweet 19901 non-null bool 13 Sentiment 19901 non-null object 14 count_Hashtags 19901 non-null int64 15 Count_Hashtags 19901 non-null int64 16 day 19901 non-null object 17 hour 19901 non-null int64 18 user_created_year 19901 non-null int64 19 word_count 19901 non-null int64 dtypes: bool(2), int64(8), object(10) memory usage: 2.8+ MB
# Define a regular expression pattern to find HTML tags
pattern = r'<[^>]+>'
# Iterate over the object columns to find HTML tags
columns_to_check = ['user_name', 'user_location', 'user_description', 'text', 'hashtags', 'source']
for column in columns_to_check:
# Apply the regular expression pattern to each value in the column
tags = data[column].apply(lambda x: re.findall(pattern, str(x)))
# Print the unique HTML tags found in the column
unique_tags = set([tag for sublist in tags for tag in sublist])
print(f"HTML tags in column '{column}':")
for tag in unique_tags:
print(tag)
print()
HTML tags in column 'user_name': HTML tags in column 'user_location': HTML tags in column 'user_description': HTML tags in column 'text': HTML tags in column 'hashtags': HTML tags in column 'source':
print('***2.C. Replace contractions in strings e.g. replace Im --> I am) and so on.***')
***2.C. Replace contractions in strings e.g. replace Im --> I am) and so on.***
def replace_contractions(text):
contraction_mapping = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
for contraction, expanded_form in contraction_mapping.items():
text = text.replace(contraction, expanded_form)
return text
# Iterate over the rows and replace contractions in the specified columns
for index, row in data.iterrows():
data.at[index, 'user_name'] = replace_contractions(row['user_name'])
data.at[index, 'user_location'] = replace_contractions(row['user_location'])
data.at[index, 'user_description'] = replace_contractions(row['user_description'])
data.at[index, 'text'] = replace_contractions(row['text'])
data.at[index, 'hashtags'] = replace_contractions(row['hashtags'])
data.at[index, 'source'] = replace_contractions(row['source'])
# Verify the changes
print(data.head())
user_name user_location \
0 Mari Smith San Diego California
1 Mari Smith San Diego California
2 Varun Tyagi Goa India
3 Casey Conway Sydney New South Wales
4 Charlotte Paul Darlington
user_description user_created \
0 Premier Facebook Marketing Expert Social Medi... 2007-09-11 22:22:51
1 Premier Facebook Marketing Expert Social Medi... 2007-09-11 22:22:51
2 Indian Tech Solution Artist Hospitality Expe... 2009-09-06 10:36:01
3 Head of Diversity Inclusion RugbyAU Its not ... 2012-12-28 21:45:06
4 Instagram Charlottejyates 2012-05-28 20:43:08
user_followers user_friends user_favourites user_verified \
0 579942 288625 11610 False
1 579942 288625 11610 False
2 257 204 475 False
3 11782 1033 12219 True
4 278 387 5850 False
date text \
0 2020-09-16 20:55:33 musicmadmarc SocialDilemma netflix Facebook Im...
1 2020-09-16 20:53:17 musicmadmarc SocialDilemma netflix Facebook ha...
2 2020-09-16 20:51:57 Go watch The Social Dilemma on Netflix\n\nIts ...
3 2020-09-16 20:51:46 I watched TheSocialDilemma last night Im scare...
4 2020-09-16 20:51:11 The problem of me being on my phone most the t...
hashtags source is_retweet Sentiment count_Hashtags \
0 NoHashtag Twitter Web App False Neutral 1
1 NoHashtag Twitter Web App False Neutral 1
2 NoHashtag Twitter for iPhone False Positive 1
3 TheSocialDilemma Twitter for iPhone False Negative 1
4 TheSocialDilemma Twitter for iPhone False Positive 1
Count_Hashtags day hour user_created_year word_count
0 1 2020-09-16 20 2007 16
1 1 2020-09-16 20 2007 18
2 1 2020-09-16 20 2009 20
3 1 2020-09-16 20 2012 22
4 1 2020-09-16 20 2012 17
print(blog['text'].iloc[3])
I watched #TheSocialDilemma last night. I’m scared for humanity. I’m not sure what to do but I’ve logged out of F… https://t.co/luOBcjCJFb
print(data['text'].iloc[3])
I watched TheSocialDilemma last night Im scared for humanity Im not sure what to do but Ive logged out of F httpstcoluOBcjCJFb
#In the above example, stricng contraction and html tag removal is witnessed after cleaning
print('***2.D. Remove the URL’s***')
***2.D. Remove the URL’s***
# Remove URLs from specific column
data['user_location'] = data['user_location'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['user_description'] = data['user_description'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['hashtags'] = data['hashtags'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
data['source'] = data['source'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
print(blog['text'].iloc[3])
I watched #TheSocialDilemma last night. I’m scared for humanity. I’m not sure what to do but I’ve logged out of F… https://t.co/luOBcjCJFb
print(data['text'].iloc[3])
I watched TheSocialDilemma last night Im scared for humanity Im not sure what to do but Ive logged out of F
#In the above example, URL removal is effective after cleaning
print('***2.E. Remove the mentions in the tweets (@)***')
***2.E. Remove the mentions in the tweets (@)***
# Remove the mentions in the twwets'@' from specific column
data['user_location'] = data['user_location'].apply(lambda x: re.sub(r'@\w+', '', x))
data['user_description'] = data['user_description'].apply(lambda x: re.sub(r'@\w+', '', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'@\w+', '', x))
data['hashtags'] = data['hashtags'].apply(lambda x: re.sub(r'@\w+', '', x))
data['source'] = data['source'].apply(lambda x: re.sub(r'@\w+', '', x))
blog.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NaN | Twitter Web App | False | Neutral |
| 1 | Mari Smith | San Diego, California | Premier Facebook Marketing Expert | Social Med... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | @musicmadmarc @SocialDilemma_ @netflix @Facebo... | NaN | Twitter Web App | False | Neutral |
| 2 | Varun Tyagi | Goa, India | Indian | Tech Solution Artist & Hospitality Ex... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch “The Social Dilemma” on Netflix!\n\nI... | NaN | Twitter for iPhone | False | Positive |
| 3 | Casey Conway | Sydney, New South Wales | Head of Diversity & Inclusion @RugbyAU | It's ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched #TheSocialDilemma last night. I’m sc... | ['TheSocialDilemma'] | Twitter for iPhone | False | Negative |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | ['TheSocialDilemma'] | Twitter for iPhone | False | Positive |
data.head()
| user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | Sentiment | count_Hashtags | Count_Hashtags | day | hour | user_created_year | word_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mari Smith | San Diego California | Premier Facebook Marketing Expert Social Medi... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:55:33 | musicmadmarc SocialDilemma netflix Facebook Im... | NoHashtag | Twitter Web App | False | Neutral | 1 | 1 | 2020-09-16 | 20 | 2007 | 16 |
| 1 | Mari Smith | San Diego California | Premier Facebook Marketing Expert Social Medi... | 2007-09-11 22:22:51 | 579942 | 288625 | 11610 | False | 2020-09-16 20:53:17 | musicmadmarc SocialDilemma netflix Facebook ha... | NoHashtag | Twitter Web App | False | Neutral | 1 | 1 | 2020-09-16 | 20 | 2007 | 18 |
| 2 | Varun Tyagi | Goa India | Indian Tech Solution Artist Hospitality Expe... | 2009-09-06 10:36:01 | 257 | 204 | 475 | False | 2020-09-16 20:51:57 | Go watch The Social Dilemma on Netflix\n\nIts ... | NoHashtag | Twitter for iPhone | False | Positive | 1 | 1 | 2020-09-16 | 20 | 2009 | 20 |
| 3 | Casey Conway | Sydney New South Wales | Head of Diversity Inclusion RugbyAU Its not ... | 2012-12-28 21:45:06 | 11782 | 1033 | 12219 | True | 2020-09-16 20:51:46 | I watched TheSocialDilemma last night Im scare... | TheSocialDilemma | Twitter for iPhone | False | Negative | 1 | 1 | 2020-09-16 | 20 | 2012 | 22 |
| 4 | Charlotte Paul | Darlington | Instagram Charlottejyates | 2012-05-28 20:43:08 | 278 | 387 | 5850 | False | 2020-09-16 20:51:11 | The problem of me being on my phone most the t... | TheSocialDilemma | Twitter for iPhone | False | Positive | 1 | 1 | 2020-09-16 | 20 | 2012 | 17 |
#From the above example, it is observed @ has been removed in text column after cleaning
print('***2.F. Remove all Stopwords***')
***2.F. Remove all Stopwords***
import nltk
from nltk.corpus import stopwords
# Download stopwords if not already downloaded
nltk.download('stopwords')
# Remove stopwords from the 'text' column
stopwords = set(stopwords.words('english'))
data['user_description'] = data['user_description'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['user_location'] = data['user_location'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['hashtags'] = data['hashtags'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
data['source'] = data['source'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords]))
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
print(blog['text'].iloc[1])
@musicmadmarc @SocialDilemma_ @netflix @Facebook haa, hey Marc. I get what you're saying & don't agree. 🤪 Whicheve… https://t.co/nsVtPHjUs8
print(data['text'].iloc[1])
musicmadmarc SocialDilemma netflix Facebook haa hey Marc get youre saying amp dont agree Whicheve
# Th above example explains after stopwords I, what removal
print('***2.G. Lowercase all textual data***')
***2.G. Lowercase all textual data***
data['user_name']=data['user_name'].str.lower()
data['user_description']=data['user_description'].str.lower()
data['text']=data['text'].str.lower()
data['hashtags']=data['hashtags'].str.lower()
data['source']=data['source'].str.lower()
print(blog['user_name'].iloc[1])
Mari Smith
print(data['user_name'].iloc[1])
mari smith
print(blog['user_description'].iloc[1])
Premier Facebook Marketing Expert | Social Media Thought Leader | Keynote Speaker | Dynamic Live Video Host | Ambassador | 🇨🇦🏴🇺🇸
print(data['user_description'].iloc[1])
premier facebook marketing expert social media thought leader keynote speaker dynamic live video host ambassador
# From the above examples, it is observed that lower case has been effective after cleaning of data
print('***2.H. Perform tokenization, lemmatization, normalization appropriately***')
***2.H. Perform tokenization, lemmatization, normalization appropriately***
!pip install --upgrade nltk
Requirement already satisfied: nltk in f:\anaconda3\lib\site-packages (3.7)
Collecting nltk
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Requirement already satisfied: tqdm in f:\anaconda3\lib\site-packages (from nltk) (4.64.0)
Requirement already satisfied: joblib in f:\anaconda3\lib\site-packages (from nltk) (1.2.0)
Requirement already satisfied: click in f:\anaconda3\lib\site-packages (from nltk) (8.0.4)
Requirement already satisfied: regex>=2021.8.3 in f:\anaconda3\lib\site-packages (from nltk) (2022.3.15)
Requirement already satisfied: colorama in f:\anaconda3\lib\site-packages (from click->nltk) (0.4.4)
Installing collected packages: nltk
Attempting uninstall: nltk
Found existing installation: nltk 3.7
Uninstalling nltk-3.7:
Successfully uninstalled nltk-3.7
Successfully installed nltk-3.8.1
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
# Download required resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))
def tokenize_and_lemmatize(text):
tokens = word_tokenize(text)
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stopwords_set]
return lemmatized_tokens
# Tokenize and lemmatize the text column
data['text_tokens_Lem'] = data['text'].apply(tokenize_and_lemmatize)
# Tokenize and lemmatize the user_description column
data['user_description_tokens_lem'] = data['user_description'].apply(tokenize_and_lemmatize)
# Print the updated dataset
print(data[['text', 'text_tokens_Lem','user_description','user_description_tokens_lem']].head())
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to [nltk_data] C:\Users\Richard\AppData\Roaming\nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
text \
0 musicmadmarc socialdilemma netflix facebook im...
1 musicmadmarc socialdilemma netflix facebook ha...
2 go watch social dilemma netflix best minutes y...
3 watched thesocialdilemma last night im scared ...
4 problem phone time trying watch thesocialdilemma
text_tokens_Lem \
0 [musicmadmarc, socialdilemma, netflix, faceboo...
1 [musicmadmarc, socialdilemma, netflix, faceboo...
2 [go, watch, social, dilemma, netflix, best, mi...
3 [watched, thesocialdilemma, last, night, im, s...
4 [problem, phone, time, trying, watch, thesocia...
user_description \
0 premier facebook marketing expert social media...
1 premier facebook marketing expert social media...
2 indian tech solution artist hospitality expert...
3 head diversity inclusion rugbyau tan im aborig...
4 instagram charlottejyates
user_description_tokens_lem
0 [premier, facebook, marketing, expert, social,...
1 [premier, facebook, marketing, expert, social,...
2 [indian, tech, solution, artist, hospitality, ...
3 [head, diversity, inclusion, rugbyau, tan, im,...
4 [instagram, charlottejyates]
# The above code is applied to tokenenize and lemmatize the text data. Now we shall apply normalize on the above data
def normalize_text(tokens):
normalized_tokens = []
for token in tokens:
# Lowercase the token
normalized_token = token.lower()
# Remove punctuation and special characters
normalized_token = re.sub(r'[^a-zA-Z0-9]', '', normalized_token)
# Add the normalized token to the list
normalized_tokens.append(normalized_token)
return normalized_tokens
# Apply text normalization to the 'text_tokens_Lem' column
data['text_normalized'] = data['text_tokens_Lem'].apply(normalize_text)
# Apply text normalization to the 'user_description_tokens_lem' column
data['user_description_normalized'] = data['user_description_tokens_lem'].apply(normalize_text)
# Print the updated dataset
print(data[['text', 'text_normalized','user_description','user_description_normalized']].head())
text \
0 musicmadmarc socialdilemma netflix facebook im...
1 musicmadmarc socialdilemma netflix facebook ha...
2 go watch social dilemma netflix best minutes y...
3 watched thesocialdilemma last night im scared ...
4 problem phone time trying watch thesocialdilemma
text_normalized \
0 [musicmadmarc, socialdilemma, netflix, faceboo...
1 [musicmadmarc, socialdilemma, netflix, faceboo...
2 [go, watch, social, dilemma, netflix, best, mi...
3 [watched, thesocialdilemma, last, night, im, s...
4 [problem, phone, time, trying, watch, thesocia...
user_description \
0 premier facebook marketing expert social media...
1 premier facebook marketing expert social media...
2 indian tech solution artist hospitality expert...
3 head diversity inclusion rugbyau tan im aborig...
4 instagram charlottejyates
user_description_normalized
0 [premier, facebook, marketing, expert, social,...
1 [premier, facebook, marketing, expert, social,...
2 [indian, tech, solution, artist, hospitality, ...
3 [head, diversity, inclusion, rugbyau, tan, im,...
4 [instagram, charlottejyates]
print('***I. Remove the hashtags***')
***I. Remove the hashtags***
def remove_hashtags(text):
# Check if the input is a string or bytes-like object
if isinstance(text, str):
# Remove hashtags using regular expressions
text_without_hashtags = re.sub(r'#\w+', '', text)
return text_without_hashtags
else:
# Return empty string if the input is not a string
return ''
# Apply hashtag removal to the 'text_normalized' column
data['text_without_hashtags'] = data['text_normalized'].apply(lambda x: remove_hashtags(str(x)))
# Apply hashtag removal to the 'user_description_normalized' column
data['user_description_without_hashtags'] = data['user_description_normalized'].apply(lambda x: remove_hashtags(str(x)))
# Print the updated dataset
print(data[['text', 'text_without_hashtags','user_description','user_description_without_hashtags']].head(10))
text \
0 musicmadmarc socialdilemma netflix facebook im...
1 musicmadmarc socialdilemma netflix facebook ha...
2 go watch social dilemma netflix best minutes y...
3 watched thesocialdilemma last night im scared ...
4 problem phone time trying watch thesocialdilemma
5 thesocialdilemma wow need regulations social m...
6 erm thesocialdilemma makes want go grid live c...
7 thesocialdilemma documentary horror live
8 okay im watching thesocialdilemma
9 okey okey ive peer pressured watching thesocia...
text_without_hashtags \
0 ['musicmadmarc', 'socialdilemma', 'netflix', '...
1 ['musicmadmarc', 'socialdilemma', 'netflix', '...
2 ['go', 'watch', 'social', 'dilemma', 'netflix'...
3 ['watched', 'thesocialdilemma', 'last', 'night...
4 ['problem', 'phone', 'time', 'trying', 'watch'...
5 ['thesocialdilemma', 'wow', 'need', 'regulatio...
6 ['erm', 'thesocialdilemma', 'make', 'want', 'g...
7 ['thesocialdilemma', 'documentary', 'horror', ...
8 ['okay', 'im', 'watching', 'thesocialdilemma']
9 ['okey', 'okey', 'ive', 'peer', 'pressured', '...
user_description \
0 premier facebook marketing expert social media...
1 premier facebook marketing expert social media...
2 indian tech solution artist hospitality expert...
3 head diversity inclusion rugbyau tan im aborig...
4 instagram charlottejyates
5 nan
6 mother optimist feminist pacifist retired deli...
7 african music lakersmanchester united dark cho...
8 igryanwhitec digital content creator beat open...
9 science kid herbivore opinionated tweets cultu...
user_description_without_hashtags
0 ['premier', 'facebook', 'marketing', 'expert',...
1 ['premier', 'facebook', 'marketing', 'expert',...
2 ['indian', 'tech', 'solution', 'artist', 'hosp...
3 ['head', 'diversity', 'inclusion', 'rugbyau', ...
4 ['instagram', 'charlottejyates']
5 ['nan']
6 ['mother', 'optimist', 'feminist', 'pacifist',...
7 ['african', 'music', 'lakersmanchester', 'unit...
8 ['igryanwhitec', 'digital', 'content', 'creato...
9 ['science', 'kid', 'herbivore', 'opinionated',...
print('***3. Build a base Classification model***')
***3. Build a base Classification model***
print('***3.A. Create dependent and independent variables***')
***3.A. Create dependent and independent variables***
data.columns
Index(['user_name', 'user_location', 'user_description', 'user_created',
'user_followers', 'user_friends', 'user_favourites', 'user_verified',
'date', 'text', 'hashtags', 'source', 'is_retweet', 'Sentiment',
'count_Hashtags', 'Count_Hashtags', 'day', 'hour', 'user_created_year',
'word_count', 'text_tokens', 'user_description_tokens',
'text_tokens_Lem', 'user_description_tokens_lem', 'text_normalized',
'user_description_normalized', 'text_without_hashtags',
'user_description_without_hashtags'],
dtype='object')
# Dependent Variable
y = data['Sentiment']
# Independent Variables
x = data[['user_description_without_hashtags','text_without_hashtags']]
print('***3.B. Split data into train and test***')
***3.B. Split data into train and test***
from sklearn.model_selection import train_test_split
# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print('***3.C. Vectorize data using any one vectorizer, so that we can feed the data in the model***')
***3.C. Vectorize data using any one vectorizer, so that we can feed the data in the model***
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize the vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the training data
x_train_vectorized = vectorizer.fit_transform(x_train)
# Transform the testing data
x_test_vectorized = vectorizer.transform(x_test)
#Using TFid Vectorizer for traiign the model
print('***D. Build a base model for Supervised Learning - Classification***')
***D. Build a base model for Supervised Learning - Classification***
# Check the dimensions of the train and test sets
print("Training set shape:", x_train.shape, y_train.shape)
print("Testing set shape:", x_test.shape, y_test.shape)
Training set shape: (15920, 2) (15920,) Testing set shape: (3981, 2) (3981,)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train['text_without_hashtags'])
x_test_vectorized = vectorizer.transform(x_test['text_without_hashtags'])
# Convert the target variable into numeric labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# Build the base model using Logistic Regression
model = LogisticRegression(max_iter=1000)
# Train the model
logreg=model.fit(x_train_vectorized, y_train_encoded)
# Make predictions on the test set
y_pred_encoded = model.predict(x_test_vectorized)
# Convert the predicted labels back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)
# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy,2))
Accuracy: 0.87
#In the base model, the accuracy is 87%.This needs to be imporved further with alternate models or hyper tuning of parameters
print('***3.E. Clearly print Performance Metrics.***')
***3.E. Clearly print Performance Metrics.***
from sklearn.metrics import classification_report
# Make predictions on the test set
y_pred = model.predict(x_test_vectorized)
# Print the classification report
print(classification_report(y_test_encoded, y_pred))
precision recall f1-score support
0 0.86 0.69 0.76 660
1 0.84 0.95 0.89 1405
2 0.91 0.89 0.90 1916
accuracy 0.87 3981
macro avg 0.87 0.84 0.85 3981
weighted avg 0.88 0.87 0.87 3981
y_train.value_counts()
Positive 7541 Neutral 5482 Negative 2897 Name: Sentiment, dtype: int64
#The following observations are made based on the above report:
# Recall & f1-score are low for Positive Sentiment
# Overall score in neutral 7 negative sentiments are above 85%. However, this could be imporved with hyper-tuning/alterante models
# Average in all 3 classes are above 80%. However, we shall try to find opportunities to improve further.
print('***3.F. Improve performance of the model and mention which parameter/hyperparameter significantly helped to improve performance and its probable reason***')
***3.F. Improve performance of the model and mention which parameter/hyperparameter significantly helped to improve performance and its probable reason***
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Define the hyperparameters and their possible values
param_grid = {
'C': [0.001, 0.01, 0.1, 1, 10, 100], # Regularization strength
'penalty': ['l1', 'l2'], # Regularization penalty type
'solver': ['liblinear', 'saga'] # Optimization algorithm
}
# Create the logistic regression model
model = LogisticRegression(max_iter=2000)
# Perform grid search cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train_vectorized, y_train_encoded)
# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)
# Use the best hyperparameters to train the final model
best_model = LogisticRegression(**best_params)
best_model.fit(x_train_vectorized, y_train_encoded)
# Evaluate the model on the test set
y_pred_gridCV = best_model.predict(x_test_vectorized)
print("Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_gridCV))
F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
Best Hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Performance Metrics:
precision recall f1-score support
0 0.88 0.72 0.79 660
1 0.85 0.97 0.91 1405
2 0.93 0.90 0.91 1916
accuracy 0.89 3981
macro avg 0.89 0.86 0.87 3981
weighted avg 0.89 0.89 0.89 3981
F:\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
print('***3.G. Try at least three different models and identify which model performs best. Print and plot Confusion matirx to get an idea of how the distribution of the prediction is among all the classes. Write your inferences on the same.***')
***3.G. Try at least three different models and identify which model performs best. Print and plot Confusion matirx to get an idea of how the distribution of the prediction is among all the classes. Write your inferences on the same.***
from sklearn.svm import SVC
# Create the SVM model
svm = SVC()
# Train the model
svm.fit(x_train_vectorized, y_train_encoded)
# Predict on the test set
y_pred_svm = svm.predict(x_test_vectorized)
# Evaluate the model
print("SVM Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_svm))
SVM Performance Metrics:
precision recall f1-score support
0 0.88 0.51 0.64 660
1 0.78 0.97 0.86 1405
2 0.89 0.86 0.87 1916
accuracy 0.84 3981
macro avg 0.85 0.78 0.79 3981
weighted avg 0.85 0.84 0.83 3981
from sklearn.naive_bayes import MultinomialNB
# Create the Naive Bayes model
naive_bayes = MultinomialNB()
# Train the model
naive_bayes.fit(x_train_vectorized, y_train_encoded)
# Predict on the test set
y_pred_naive_bayes = naive_bayes.predict(x_test_vectorized)
# Evaluate the model
print("Naive Bayes Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_naive_bayes))
Naive Bayes Performance Metrics:
precision recall f1-score support
0 0.78 0.45 0.57 660
1 0.85 0.70 0.77 1405
2 0.72 0.92 0.81 1916
accuracy 0.77 3981
macro avg 0.79 0.69 0.72 3981
weighted avg 0.78 0.77 0.76 3981
from sklearn.ensemble import RandomForestClassifier
# Create the Random Forest model
random_forest = RandomForestClassifier()
# Train the model
random_forest.fit(x_train_vectorized, y_train_encoded)
# Predict on the test set
y_pred_random_forest = random_forest.predict(x_test_vectorized)
# Evaluate the model
print("Random Forest Performance Metrics:")
print(classification_report(y_test_encoded, y_pred_random_forest))
Random Forest Performance Metrics:
precision recall f1-score support
0 0.87 0.52 0.66 660
1 0.80 0.96 0.87 1405
2 0.88 0.87 0.87 1916
accuracy 0.84 3981
macro avg 0.85 0.78 0.80 3981
weighted avg 0.85 0.84 0.84 3981
print('***3.H. Wordcloud of top 40 important features from the final model chosen***')
***3.H. Wordcloud of top 40 important features from the final model chosen***
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from sklearn.inspection import permutation_importance
result = permutation_importance(logreg, x_test_vectorized.toarray(), y_test_encoded, n_repeats=10, random_state=42)
feature_importance_scores = result.importances_mean
# Sort the features based on their importance scores
sorted_features = sorted(feature_importance_scores.items(), key=lambda x: x[1], reverse=True)
# Select the top 40 features
top_features = dict(sorted_features[:40])
# Generate a word cloud visualization
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(top_features)
# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()